/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(ggrepel)
library(BayesPrism)
## Loading required package: snowfall
## Loading required package: snow
## Loading required package: NMF
## Loading required package: pkgmaker
## Loading required package: registry
## Loading required package: rngtools
## Loading required package: cluster
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 63/64
##   To enable shared memory capabilities, try: install.extras('
## NMF
## ')
## Warning: replacing previous import 'gplots::lowess' by 'stats::lowess' when
## loading 'BayesPrism'
## Warning: replacing previous import 'BiocParallel::register' by 'NMF::register'
## when loading 'BayesPrism'
dir.create(params$out_path)
## Warning in dir.create(params$out_path):
## '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/figures/ssGSEA'
## already exists
plot_geneset = function(id, group_pal = c('WT' = 'blue', 'EZHIP' = 'red')){

id_plot = str_replace_all(id, ' ', '_')
id_plot = str_replace_all(id_plot, '[.]', '_')
id_plot = str_replace_all(id_plot, '[/]', '_')

p = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
    select(ID, sample, Group, ssGSEA, Deconvolution) %>%
  filter(ID == id) %>%
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution')),
         Group = factor(Group, levels = names(group_pal))) %>% 
ggplot(aes(Group, value, color = Group)) +
  geom_point(size = 5, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank()) +
  facet_wrap(.~name, scales = 'free_y') +
  ggtitle(id)

p %>% print 

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.', id_plot, '.dotplot.pdf'))
p %>% print 
dev.off()

genes = (deg %>% 
  filter(ENS %in% unlist(atlas[['hg_ens']][id])) %>%
  filter(padj < 0.05 & log2FoldChange > 0))$ENS

p = counts %>% 
  filter(ENS %in% genes) %>%
  select(-ENS) %>%
  pivot_longer(-SYM) %>%
  left_join(meta, by = c('name' = 'Nickname')) %>%
  mutate(Group = factor(Group, levels = names(group_pal))) %>%
  ggplot(aes(Group, value, color = Group)) +
  geom_point(size = 2, alpha = 0.8) +
  facet_wrap(.~SYM, scales = 'free_y') +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        axis.title.x = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank()) +
    scale_y_continuous(labels = label_number(suffix = "K", scale = 1e-3)) +
  ylab('Norm Counts') +
  ggtitle(id)

p %>% print 

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.', id_plot, '.genes_dotplot.pdf'))
p %>% print 
dev.off()

mat = counts %>% 
  filter(ENS %in% genes) %>%
  select(-ENS) %>%
  column_to_rownames('SYM') %>%
  t() 

mat = t(scale(mat))

col_fun = circlize::colorRamp2(c(range(mat)[1], 0, range(mat)[2]), c("#235789", "#F2EFC7", "#BC412B")) 

mat = mat[,meta$Nickname, drop=F]

ha = columnAnnotation(cond = meta$Group, col = list(cond = group_pal))

h = Heatmap(mat,
        name = 'RNA', 
        bottom_annotation = ha, 
        row_title = str_replace(params$atlas_prefix, '_', ' '),
        col = col_fun, 
        width = ncol(mat)*unit(4, "mm"),
        height = nrow(mat)*unit(4, "mm"),
        rect_gp = gpar(color = 'black'), 
        show_row_dend = F)

draw(h, heatmap_legend_side = "left", annotation_legend_side = 'left')

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.', id_plot, '.heatmap.pdf'), height = 10)
draw(h, heatmap_legend_side = "left", annotation_legend_side = 'left')
dev.off()
}

plot_geneset_clin = function(id, group_pal = c('EZHIP_neg' = 'blue', 'H3K27me3_Low' = 'deepskyblue', 'EZHIP_pos' = 'red')){

id_plot = str_replace_all(id, ' ', '_')
id_plot = str_replace_all(id_plot, '[.]', '_')
id_plot = str_replace_all(id_plot, '[/]', '_')

p = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
) %>%
    select(ID, sample, Group, ssGSEA, Deconvolution) %>%
  filter(ID == id) %>%
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution')),
         Group = factor(Group, levels = names(group_pal))) %>% 
ggplot(aes(Group, value, color = Group)) +
  geom_point(size = 5, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank()) +
  facet_wrap(.~name, scales = 'free_y') +
  ggtitle(id)

p %>% print 

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.', id_plot, '.dotplot.pdf'))
p %>% print
dev.off()

genes = (deg %>% 
  filter(ENS %in% unlist(atlas[['hg_ens']][id])) %>%
  filter(log2FoldChange > 0.5))$ENS

p = counts %>% 
  filter(ENS %in% genes) %>%
  select(-ENS) %>%
  pivot_longer(-SYM) %>%
  left_join(meta, by = c('name' = 'Nickname')) %>%
  mutate(Group = factor(Group, levels = names(group_pal))) %>%
  ggplot(aes(Group, value, color = Group)) +
  geom_point(size = 2, alpha = 0.8) +
  facet_wrap(.~SYM, scales = 'free_y') +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        axis.title.x = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank()) +
    scale_y_continuous(labels = label_number(suffix = "K", scale = 1e-3)) +
  ylab('Norm Counts') +
  ggtitle(id)

p %>% print 

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.', id_plot, '.genes_dotplot.pdf'))
p %>% print()
dev.off()

mat = counts %>% 
  filter(ENS %in% genes) %>%
  select(-ENS) %>%
  column_to_rownames('SYM') %>%
  t() 

mat = t(scale(mat))

col_fun = circlize::colorRamp2(c(range(mat)[1], 0, range(mat)[2]), c("#235789", "#F2EFC7", "#BC412B")) 

mat = mat[,meta$Nickname, drop=F]

colnames(mat) == meta$Nickname

ha = columnAnnotation(cond = meta$Group, col = list(cond = group_pal))

h = Heatmap(mat,
        name = 'RNA', 
        bottom_annotation = ha, 
        row_title = str_replace(params$atlas_prefix, '_', ' '),
        col = col_fun, 
        width = ncol(mat)*unit(4, "mm"),
        height = nrow(mat)*unit(4, "mm"),
        rect_gp = gpar(color = 'black'), 
        show_row_dend = F)

draw(h, heatmap_legend_side = "left", annotation_legend_side = 'left')

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.', id_plot, '.heatmap.pdf'), height = 10)
draw(h, heatmap_legend_side = "left", annotation_legend_side = 'left')
dev.off()
}

1 hMSC

sample_prefix = 'hMSC'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/hMSC/out/01_raptor_exprtools/EZHIP/"
group_pal = c('blue', 'red')
names(group_pal) = data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group
meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/Ensembl.ensGene.exon.norm.tsv.gz")): Detected 6 column names but the
## data has 7 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/Ensembl.ensGene.exon/WTvsEZHIP.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))

1.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/hMSC/out/04_ssGSEA/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/hMSC/out/03_deconvolution/Skeletal_Development_Baccin_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Smooth muscle"
plot_geneset(id)

## png 
##   2

1.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/hMSC/out/04_ssGSEA/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/hMSC/out/03_deconvolution/Skeletal_Development_Baryawno_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "Pericytes"
plot_geneset(id)
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Removed 6 rows containing missing values (`geom_point()`).

## png 
##   2

1.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/hMSC/out/04_ssGSEA/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/hMSC/out/03_deconvolution/Skeletal_Muscle_Tissue_hg/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "ACTA2+ MYH11+ MYL9+ Smooth muscle cells"
plot_geneset(id)

## png 
##   2

2 MG63 Cell Line

sample_prefix = 'MG63_cell'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/MG63/out/01_raptor_exprtools/EZHIP/"
group_pal = c('blue', 'red')
names(group_pal) = data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group
meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/Ensembl.ensGene.exon.norm.tsv.gz")): Detected 6 column names but the
## data has 7 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/Ensembl.ensGene.exon/WTvsEZHIP.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))

2.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/MG63/out/04_ssGSEA/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/MG63/out/03_deconvolution/Skeletal_Development_Baccin_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Smooth muscle"
plot_geneset(id)

## png 
##   2
id = "Osteo-CAR"
plot_geneset(id)

## png 
##   2
id = "Adipo-CAR"
plot_geneset(id)

## png 
##   2

2.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/MG63/out/04_ssGSEA/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/MG63/out/03_deconvolution/Skeletal_Development_Baryawno_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "Pericytes"
plot_geneset(id)
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Removed 6 rows containing missing values (`geom_point()`).

## png 
##   2

2.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/MG63/out/04_ssGSEA/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/MG63/out/03_deconvolution/Skeletal_Muscle_Tissue_hg/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "ACTA2+ MYH11+ MYL9+ Smooth muscle cells"
plot_geneset(id)

## png 
##   2

3 KHOS Cell Line

sample_prefix = 'KHOS_cell'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/KHOS/out/01_raptor_exprtools/EZHIP/"
group_pal = c('blue', 'red')
names(group_pal) = data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group
meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/Ensembl.ensGene.exon.norm.tsv.gz")): Detected 6 column names but the
## data has 7 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/Ensembl.ensGene.exon/WTvsEZHIP.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))

3.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/KHOS/out/04_ssGSEA/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/KHOS/out/03_deconvolution/Skeletal_Development_Baccin_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Osteo-CAR"
plot_geneset(id)

## png 
##   2
id = "Adipo-CAR"
plot_geneset(id)

## png 
##   2

3.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/KHOS/out/04_ssGSEA/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/KHOS/out/03_deconvolution/Skeletal_Development_Baryawno_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "Early_Osteoprogenitors"
plot_geneset(id)

## png 
##   2

3.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/KHOS/out/04_ssGSEA/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/KHOS/out/03_deconvolution/Skeletal_Muscle_Tissue_hg/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "PAX7+ DLK1+ MuSCs and progenitors"
plot_geneset(id)

## png 
##   2

4 U2OS

sample_prefix = 'U2OS'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/U2OS/out/01_raptor_exprtools/EZHIP_minus_C77_C2/"
group_pal = c('blue', 'red')
names(group_pal) = data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group
meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/Ensembl.ensGene.exon.norm.tsv.gz")): Detected 6 column names but the
## data has 7 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/Ensembl.ensGene.exon/EZHIP_KOvsEZHIP.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))

4.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/U2OS/out/04_ssGSEA/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/U2OS/out/03_deconvolution/Skeletal_Development_Baccin_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Smooth muscle"
plot_geneset(id, group_pal = c('EZHIP_KO' = 'blue', 'EZHIP' = 'red'))

## png 
##   2
id = "Osteo-CAR"
plot_geneset(id, group_pal = c('EZHIP_KO' = 'blue', 'EZHIP' = 'red'))

## png 
##   2
id = "Adipo-CAR"
plot_geneset(id, group_pal = c('EZHIP_KO' = 'blue', 'EZHIP' = 'red'))

## png 
##   2
id = "Ng2+ MSCs"
plot_geneset(id, group_pal = c('EZHIP_KO' = 'blue', 'EZHIP' = 'red'))

## png 
##   2

4.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/U2OS/out/04_ssGSEA/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/U2OS/out/03_deconvolution/Skeletal_Development_Baryawno_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "OLC_1"
plot_geneset(id,  group_pal = c('EZHIP_KO' = 'blue', 'EZHIP' = 'red'))
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Removed 6 rows containing missing values (`geom_point()`).

## png 
##   2

4.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/U2OS/out/04_ssGSEA/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/U2OS/out/03_deconvolution/Skeletal_Muscle_Tissue_hg/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "PAX7low MYF5+ MuSCs and progenitors"
plot_geneset(id,  group_pal = c('EZHIP_KO' = 'blue', 'EZHIP' = 'red'))

## png 
##   2
id = 'ACTA1+ Mature skeletal muscle'
plot_geneset(id,  group_pal = c('EZHIP_KO' = 'blue', 'EZHIP' = 'red'))

## png 
##   2

5 MG63 Xenograft

sample_prefix = 'MG63_Xgraft'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/01_raptor_exprtools/MG63/"
group_pal = c('blue', 'red')
names(group_pal) = data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group
meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/hg19.Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/hg19.Ensembl.ensGene.exon.norm.tsv.gz")): Detected 5 column names but
## the data has 6 columns (i.e. invalid file). Added 1 extra default column name
## for the first column which is guessed to be row names or an index. Use
## setnames() afterwards if this guess is not correct, or fix the file write
## command that created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/hg19.Ensembl.ensGene.exon/WTvsEZHIP.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))

5.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/04_ssGSEA/MG63/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/05_deconvolution/Skeletal_Development_Baccin_mm/MG63/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Adipo-CAR"
plot_geneset(id)

## png 
##   2
id = "Osteo-CAR"
plot_geneset(id)

## png 
##   2
id = "Chondrocytes"
plot_geneset(id)

## png 
##   2

5.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/04_ssGSEA/MG63/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/05_deconvolution/Skeletal_Development_Baryawno_mm/MG63/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 20 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 20 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "Early_Osteoprogenitors"
plot_geneset(id)

## png 
##   2
id = "Osteoprogenitors"
plot_geneset(id)

## png 
##   2

5.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/04_ssGSEA/MG63/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/05_deconvolution/Skeletal_Muscle_Tissue_hg/MG63/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "PAX7+ DLK1+ MuSCs and progenitors"
plot_geneset(id)

## png 
##   2

6 KHOS Xenograft

sample_prefix = 'KHOS_Xgraft'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/01_raptor_exprtools/KHOS/"
group_pal = c('blue', 'red')
names(group_pal) = data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group
meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/hg19.Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/hg19.Ensembl.ensGene.exon.norm.tsv.gz")): Detected 6 column names but
## the data has 7 columns (i.e. invalid file). Added 1 extra default column name
## for the first column which is guessed to be row names or an index. Use
## setnames() afterwards if this guess is not correct, or fix the file write
## command that created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/hg19.Ensembl.ensGene.exon/WTvsEZHIP.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))

6.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/04_ssGSEA/KHOS/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/05_deconvolution/Skeletal_Development_Baccin_mm/KHOS/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Fibro/Chondro p."
plot_geneset(id)

## png 
##   2
id = "Osteo-CAR"
plot_geneset(id)

## png 
##   2
id = "Chondrocytes"
plot_geneset(id)

## png 
##   2
id = "Adipo-CAR"
plot_geneset(id)

## png 
##   2

6.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/04_ssGSEA/KHOS/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/05_deconvolution/Skeletal_Development_Baryawno_mm/KHOS/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat_ssgsea = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat_ssgsea) = 'p_val'

# adjust p-value 
stat_ssgsea = stat_ssgsea %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat_ssgsea = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat_ssgsea, by = 'ID')

stat_ssgsea
tmp = theta %>% 
  rownames_to_column('ID') %>%
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'frac') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat_deconv = as.data.frame(t((rbind(by(tmp, tmp$ID, function(frac) { t.test(frac~Group, data=frac, paired=FALSE)$p.value })))))

names(stat_deconv) = 'p_val'

# adjust p-value 
stat_deconv = stat_deconv %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat_deconv = theta %>%
  as.data.frame() %>% 
   rownames_to_column('ID') %>%
  pivot_longer(-ID, names_to = 'sample', values_to = 'frac') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(frac)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat_deconv, by = 'ID')

stat_deconv
full_join(stat_ssgsea, stat_deconv,  by = 'ID',  suffix = c(".ssgsea", ".deconv")) %>%
  ggplot(aes(log2(log2_FC.ssgsea+1), log2(log2_FC.deconv+1), label = ID)) +
  geom_point() +
  geom_text_repel()
## Warning in FUN(X[[i]], ...): NaNs produced

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 7 rows containing missing values (`geom_point()`).
## Warning: Removed 7 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 24 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "OLC_1"
plot_geneset(id)
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Removed 6 rows containing missing values (`geom_point()`).

## png 
##   2
id = "Chondro_progen"
plot_geneset(id)

## png 
##   2
id = "Osteoprogenitors"
plot_geneset(id)

## png 
##   2
id = "Early_Osteoprogenitors"
plot_geneset(id)

## png 
##   2

6.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/04_ssGSEA/KHOS/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/xgrafts/out/05_deconvolution/Skeletal_Muscle_Tissue_hg/KHOS/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
tmp = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname'))

# calculate p-value 
stat = as.data.frame(t((rbind(by(tmp, tmp$ID, function(ssGSEA) { t.test(ssGSEA~Group, data=ssGSEA, paired=FALSE)$p.value })))))

names(stat) = 'p_val'

# adjust p-value 
stat = stat %>% 
  rownames_to_column('ID') %>%
  mutate(adj_p_val = p.adjust(p_val, method = 'fdr')) %>%
  arrange(adj_p_val)

# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) %>%
  full_join(stat, by = 'ID')

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "ACTA2+ MYH11+ MYL9+ Smooth muscle cells"
plot_geneset(id)

## png 
##   2

7 Clinical EZHIP pos vs EZHIP neg

sample_prefix = 'clinical'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/01_raptor_exprtools/EZHIP_neg_VsEZHIP_pos_no_reps/"
group_pal = c('blue', 'red', 'gold')
names(group_pal) = c(data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group, 'S12003')

meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/Ensembl.ensGene.exon.norm.tsv.gz")): Detected 24 column names but the
## data has 25 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/Ensembl.ensGene.exon/EZHIP_negvsEZHIP_pos.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))
genes = c('CXorf67')
p = counts %>% 
  filter(SYM %in% genes) %>%
  select(-ENS) %>%
  pivot_longer(-SYM) %>%
  left_join(meta, by = c('name' = 'Nickname')) %>%
  mutate(Group = factor(Group, levels = names(group_pal))) %>%
  ggplot(aes(Group, value, color = Group, label = Patient_ID)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_text_repel() +
  facet_wrap(.~SYM, scales = 'free_y') +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        axis.title.x = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        aspect.ratio = 1) +
    scale_y_continuous(labels = label_number(suffix = "K", scale = 1e-3)) +
  ylab('Norm Counts') 
p
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

7.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/04_ssGSEA/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/05_deconvolution/Skeletal_Development_Baccin_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC)

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  select(ID, sample, Group, ssGSEA, Deconvolution) %>%
  mutate(ID = factor(ID, levels = order),
         Group = ifelse(sample == 'S12003', 'S12003', Group)) %>%
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Ng2+ MSCs"
plot_geneset_clin(id)

## png 
##   2
id = "Osteo-CAR"
plot_geneset_clin(id)

## png 
##   2
id = "Chondrocytes"
plot_geneset_clin(id)

## png 
##   2
id = "Adipo-CAR"
plot_geneset_clin(id)

## png 
##   2

7.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/04_ssGSEA/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/05_deconvolution/Skeletal_Development_Baryawno_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
# add log2 FC 
stat_ssgsea = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) 

stat_ssgsea
# add log2 FC 
stat_deconv = theta %>%
  as.data.frame() %>% 
   rownames_to_column('ID') %>%
  pivot_longer(-ID, names_to = 'sample', values_to = 'frac') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(frac)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC)

stat_deconv
order = (stat_ssgsea %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  select(ID, sample, Group, ssGSEA, Deconvolution) %>%
mutate(ID = factor(ID, levels = order),
         Group = ifelse(sample == 'S12003', 'S12003', Group)) %>%
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 96 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 96 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "Preosteoblasts"
plot_geneset_clin(id)

## png 
##   2
id = "Osteoprogenitors"
plot_geneset_clin(id)

## png 
##   2

7.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/04_ssGSEA/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/05_deconvolution/Skeletal_Muscle_Tissue_hg/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea)
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta
# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) 
stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution')%>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  select(ID, sample, Group, ssGSEA, Deconvolution) %>%
  mutate(ID = factor(ID, levels = order),
         Group = ifelse(sample == 'S12003', 'S12003', Group)) %>% 
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = c(group_pal, 'S12003' = 'gold')) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "PAX7+ DLK1+ MuSCs and progenitors"
plot_geneset_clin(id) 

## png 
##   2

8 Clinical EZHIP pos vs EZHIP neg (low and high samples only)

sample_prefix = 'clinical_filt'
raptor_path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/01_raptor_exprtools/EZHIP_neg_VsEZHIP_pos_no_reps/"
group_pal = c('blue', 'red')
names(group_pal) = c(data.table::fread(paste0(raptor_path, "info.groups.tsv"))$Group)

meta = data.table::fread(paste0(raptor_path, "info.samples.tsv" )) 

counts = data.table::fread(paste0(raptor_path, "/counts/Ensembl.ensGene.exon.norm.tsv.gz")) %>%
  separate(V1, sep = ':', into = c('ENS', 'SYM'))
## Warning in data.table::fread(paste0(raptor_path,
## "/counts/Ensembl.ensGene.exon.norm.tsv.gz")): Detected 24 column names but the
## data has 25 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
deg = data.table::fread(paste0(raptor_path, "/diff/Ensembl.ensGene.exon/EZHIP_negvsEZHIP_pos.tsv"))%>%
  separate(ID, sep = ':', into = c('ENS', 'SYM'))
genes = c('CXorf67')
p = counts %>% 
  filter(SYM %in% genes) %>%
  select(-ENS) %>%
  pivot_longer(-SYM) %>%
  left_join(meta, by = c('name' = 'Nickname')) %>%
  mutate(Group = factor(Group, levels = names(group_pal))) %>%
  arrange(desc(value)) %>%
  mutate(rank = dense_rank((value)),
         lab = ifelse(Group == 'EZHIP_pos',Patient_ID, NA)) %>%
  ggplot(aes(rank, value, color = Group, label = lab)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_text_repel() +
  facet_wrap(.~SYM, scales = 'free_y') +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        legend.position = 'none', 
        strip.background = element_blank(), 
        aspect.ratio = 1) +
    scale_y_continuous(labels = label_number(suffix = "K", scale = 1e-3)) +
  ylab('Norm Counts') +
  xlab('Rank')
p
## Warning: Removed 19 rows containing missing values (`geom_text_repel()`).

# Save top/low EZHIP IDs for later
samples = (counts %>% 
  filter(SYM %in% genes) %>%
  select(-ENS) %>%
  pivot_longer(-SYM) %>%
  left_join(meta, by = c('name' = 'Nickname')) %>%
  mutate(Group = factor(Group, levels = names(group_pal))) %>%
  arrange(desc(value)) %>%
  mutate(rank = dense_rank((value))) %>%
  filter(Group == 'EZHIP_pos' & value > 10 | Group == 'EZHIP_neg' & value == 0))$name
meta = meta %>% filter(Nickname %in% samples)
counts = counts[, c('ENS', 'SYM', samples)]
genes = c('CXorf67')
p = counts %>% 
  filter(SYM %in% genes) %>%
  select(-ENS) %>%
  pivot_longer(-SYM) %>%
  inner_join(meta, by = c('name' = 'Nickname')) %>%
  mutate(Group = factor(Group, levels = names(group_pal))) %>%
  ggplot(aes(Group, log2(value+1), color = Group, label = Patient_ID)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_text_repel() +
  facet_wrap(.~SYM, scales = 'free_y') +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        axis.title.x = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        aspect.ratio = 1) +
    scale_y_continuous(labels = label_number(suffix = "K", scale = 1e-3)) +
  ylab('Log2 Norm Counts') 
p
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

p = counts %>% 
  filter(SYM %in% genes) %>%
  select(-ENS) %>%
  pivot_longer(-SYM) %>%
  inner_join(meta, by = c('name' = 'Nickname')) %>%
  mutate(Group = factor(Group, levels = names(group_pal))) %>%
  ggplot(aes(Group, value, color = Group, label = Patient_ID)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_text_repel() +
  facet_wrap(.~SYM, scales = 'free_y') +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(),  
        axis.title.x = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        aspect.ratio = 1) +
    scale_y_continuous(labels = label_number(suffix = "K", scale = 1e-3)) +
  ylab('Norm Counts') 
p
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

8.1 Baccin

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/04_ssGSEA/baccin_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/05_deconvolution/Skeletal_Development_Baccin_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea) %>% as.data.frame()
ssgsea = ssgsea[,c('ID', samples)]
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta = theta[,samples]
theta
# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC)

stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  inner_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  inner_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  select(ID, sample, Group, ssGSEA, Deconvolution) %>%
  mutate(ID = factor(ID, levels = order)) %>%
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
  arrange(Group) %>%
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = "Ng2+ MSCs"
plot_geneset_clin(id)

## png 
##   2
id = "Smooth muscle"
plot_geneset_clin(id)

## png 
##   2
id = "Osteo-CAR"
plot_geneset_clin(id)

## png 
##   2
id = "Chondrocytes"
plot_geneset_clin(id)

## png 
##   2
id = "Adipo-CAR"
plot_geneset_clin(id)

## png 
##   2

8.2 Baryawno

atlas_prefix = 'baryawno'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Development_Baryawno_mm/out/01_gene_sets/develop_bone_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/04_ssGSEA/baryawno_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/05_deconvolution/Skeletal_Development_Baryawno_mm/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea) %>% as.data.frame()
ssgsea = ssgsea[,c('ID', samples)]
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta = theta[,samples]
theta
# add log2 FC 
stat_ssgsea = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) 

stat_ssgsea
# add log2 FC 
stat_deconv = theta %>%
  as.data.frame() %>% 
   rownames_to_column('ID') %>%
  pivot_longer(-ID, names_to = 'sample', values_to = 'frac') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(frac)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC)

stat_deconv
order = (stat_ssgsea %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  select(ID, sample, Group, ssGSEA, Deconvolution) %>%
mutate(ID = factor(ID, levels = order)) %>%
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
  arrange(Group) %>%
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1
## Warning: Removed 44 rows containing missing values (`geom_point()`).

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
## Warning: Removed 44 rows containing missing values (`geom_point()`).
dev.off()
## png 
##   2
id = "Preosteoblasts"
plot_geneset_clin(id)

## png 
##   2
id = "Osteoprogenitors"
plot_geneset_clin(id)

## png 
##   2

8.3 DeMicheli

atlas_prefix = 'demicheli'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Skeletal_Muscle_Tissue_hg/out/02_gene_sets/skeletal_muscle_atlas_gene_sets.n100.Rda'

ssgsea = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/04_ssGSEA/demicheli_n100.ssgsea.txt'
deconv = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/05_deconvolution/Skeletal_Muscle_Tissue_hg/bp.res.Rda'
loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}
atlas = loadRData(atlas_path)
load(deconv)
ssgsea = data.table::fread(ssgsea) %>% as.data.frame()
ssgsea = ssgsea[, c('ID', samples)]
ssgsea
theta = get.fraction(bp=bp.res,
            which.theta="final",
            state.or.type="type") * 100

theta = theta %>% t %>% as.data.frame() 
theta = theta[, samples]
theta
# add log2 FC 
stat = ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')) %>%
  group_by(ID, Group) %>%
  mutate(mean = mean(ssGSEA)) %>%
  ungroup() %>%
  distinct(ID, mean, Group) %>%
  pivot_wider(names_from = Group, values_from = mean) %>%
  mutate(log2_FC = (.data[[names(group_pal)[2]]]) - (.data[[names(group_pal)[1]]])) %>%
  arrange(desc(log2_FC)) %>%
  select(ID, log2_FC) 
stat
order = (stat %>% arrange(log2_FC))$ID

p1 = full_join(
ssgsea %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'ssGSEA') %>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
theta %>%
  rownames_to_column('ID') %>% 
  as.data.frame() %>% 
  pivot_longer(-ID, names_to = 'sample', values_to = 'Deconvolution')%>%
  left_join(meta %>% select(-ID), by = c('sample' = 'Nickname')),
by = c('sample', 'ID', 'Group')
)  %>%
  select(ID, sample, Group, ssGSEA, Deconvolution) %>%
  mutate(ID = factor(ID, levels = order)) %>% 
  arrange(Group) %>%
  pivot_longer(-c(ID, sample, Group)) %>%
  mutate(name = factor(name, levels = c('ssGSEA', 'Deconvolution'))) %>% 
ggplot(aes(ID, value, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = group_pal) +
  theme_bw() +
  theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        axis.title = element_blank(), 
        legend.position = 'none', 
        strip.background = element_blank(), 
        panel.grid.major.y = element_line(color = "lightgrey",
                                          size = 0.3,
                                          linetype = 2)) +
  facet_wrap(.~name, scales = 'free_x') +
  coord_flip()

p1

pdf(paste0(params$out_path, sample_prefix, '.', atlas_prefix, '.pdf'))
p1
dev.off()
## png 
##   2
id = 'ACTA1+ Mature skeletal muscle'
plot_geneset_clin(id) 

## png 
##   2
id = "PAX7+ DLK1+ MuSCs and progenitors"
plot_geneset_clin(id) 

## png 
##   2

9 Joint clustering

9.1 Baccin

pattern = 'baccin_n100.ssgsea.txt'
group_pal = c('WT' = 'blue', 'EZHIP' = 'red')

9.1.1 Cancer Cell Lines

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = 'baccin_n100.ssgsea.txt', 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'U2OS_cell'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "U2OS_cell"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:4){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

9.1.2 Cancer Cell Lines + hMSC

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = 'baccin_n100.ssgsea.txt', 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'U2OS_cell', 'hMSC_cell'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "hMSC_cell"
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "U2OS_cell"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.1.3 Cancer Cell Lines + Xgrafts

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = 'baccin_n100.ssgsea.txt', 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "U2OS_cell"
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.1.4 KHOS MG63 Cell Lines + Xgrafts

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = 'baccin_n100.ssgsea.txt', 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.1.5 MG63 Cell Lines + Xgrafts

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('MG63_cell', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "MG63_cell"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.1.6 KHOS Cell Lines + Xgrafts

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'KHOS_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "KHOS_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.1.7 U2OS + Xgrafts

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = 'baccin_n100.ssgsea.txt', 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "U2OS_cell"
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.1.8 Xgrafts

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = 'baccin_n100.ssgsea.txt', 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.1.9 Clinical

group_pal = c('EZHIP_neg' = 'blue', 'EZHIP_pos' = 'red')
clical_meta = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/01_raptor_exprtools/EZHIP_neg_VsEZHIP_pos_no_reps/info.samples.tsv'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('clinical'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "clinical"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.table::fread(clical_meta) %>% 
  mutate(Sample = paste0('clinical.', Nickname)) 

pal = c('EZHIP_pos' = 'red', 'EZHIP_neg' = 'blue', 'H3K27me3_Low' = 'deepskyblue')
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Nickname)) +
  scale_color_manual(values = pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black", size = 2) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

meta = meta %>% filter(Nickname %in% samples)
df = df[,meta$Sample]
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Nickname)) +
  scale_color_manual(values = pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black", size = 2) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2 Baryawno

pattern = 'baryawno_n100.ssgsea.txt'
group_pal = c('WT' = 'blue', 'EZHIP' = 'red')

9.2.1 Cancer Cell Lines

atlas_prefix = 'baccin'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'U2OS_cell'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "U2OS_cell"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:4){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

9.2.2 Cancer Cell Lines + hMSC

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'U2OS_cell', 'hMSC_cell'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "hMSC_cell"
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "U2OS_cell"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2.3 Cancer Cell Lines + Xgrafts

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "U2OS_cell"
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2.4 KHOS MG63 Cell Lines + Xgrafts

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'MG63_cell', 'KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "MG63_cell"
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2.5 MG63 Cell Lines + Xgrafts

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('MG63_cell', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "MG63_cell"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2.6 KHOS Cell Lines + Xgrafts

atlas_prefix = 'baccin'
atlas_path = '/lustre06/project/6004736/alvann/from_narval/REFERENCES/Bone_Marrow_Baccin/out/01_cluster_markers/bone_atlas_baccin_gene_sets.n100.Rda'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_cell', 'KHOS_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_cell"
## [1] "KHOS_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2.7 U2OS + Xgrafts

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "U2OS_cell"
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2.8 Xgrafts

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('KHOS_xeno', 'MG63_xeno'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "KHOS_xeno"
## [1] "MG63_xeno"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.frame(Sample = names(df)) %>%
  separate(Sample, sep = '[.]', into = c('Comp', 'Sample_Short'), remove = F) %>%
  separate(Comp, sep = '_', into = c('Cell_Line', 'Type'), remove = F) %>%
  mutate(Group = ifelse(grepl('EZHIP|C1', Sample), 'EZHIP', 'WT'))
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Comp)) +
  scale_color_manual(values = group_pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black" ) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

9.2.9 Clinical

group_pal = c('EZHIP_neg' = 'blue', 'EZHIP_pos' = 'red', 'H3K27me3_Low' = 'deepskyblue')
clical_meta = '/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_clinical_seq/out/01_raptor_exprtools/EZHIP_neg_VsEZHIP_pos_no_reps/info.samples.tsv'

ssgsea_files = list.files(path = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/", 
           pattern = pattern, 
           full.names = T,
           recursive = T) %>%
  data.frame(group = c('hMSC_cell', 'KHOS_cell', 'MG63_cell', 'clinical', 'U2OS_cell', 'KHOS_xeno', 'MG63_xeno'))
names(ssgsea_files)[1] = c('path')
ssgsea_files = filter(ssgsea_files, group %in% c('clinical'))

l = list()
for(g in ssgsea_files$group){
  print(g)
  
  l[[g]] = data.table::fread(ssgsea_files[ssgsea_files$group == g,]$path) 
}
## [1] "clinical"
df = do.call(cbind, l) 
names(df)[1] = 'tmp' 
df = df %>% 
  select(!ends_with('ID')) %>%
  column_to_rownames('tmp')

meta = data.table::fread(clical_meta) %>% 
  mutate(Sample = paste0('clinical.', Nickname)) 

pal = c('EZHIP_pos' = 'red', 'EZHIP_neg' = 'blue')
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Nickname)) +
  scale_color_manual(values = pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black", size = 2) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Sex, label = Nickname)) +
  scale_color_manual(values = c('green', 'orange')) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black", size = 2) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1) + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Age, label = Nickname)) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black", size = 2) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1) + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`

meta = meta %>% filter(Nickname %in% samples)
df = df[,meta$Sample]
pca = prcomp(df %>% t)

var = tibble(PC = factor(1:length(pca$sdev^2)), 
                         variance = pca$sdev) %>% 
  mutate(pct = variance/sum(variance)*100)

for(i in 1:5){
  pc.1 = paste0('PC', i)
  pc.2 = paste0('PC', i+1)
  print(pc.1)
  print(pc.2)
  
 pca_plot = pca$x %>% 
  as.data.frame() %>%
  rownames_to_column('Sample') %>%
  left_join(meta) %>% 
  ggplot(aes(x=.data[[pc.1]], y=.data[[pc.2]], color = Group, label = Nickname)) +
  scale_color_manual(values = pal) +
    geom_point(size = 4) +
  ggrepel::geom_text_repel(show.legend  = F, box.padding = 0.5, max.overlaps = Inf, color="black", size = 2) +
    theme_bw() +
    theme(text = element_text(size = 15),
        axis.line = element_line(size = 0.5),
        panel.border = element_blank(),
        panel.grid = element_blank(),
        legend.title = element_blank(), 
        aspect.ratio = 1, 
        legend.position = 'none') + 
  xlab(paste0(pc.1, ' ', round(var[i,]$pct, digits = 1), '%')) +
  ylab(paste0(pc.2, ' ', round(var[i+1,]$pct, digits = 1), '%')) 
print(pca_plot)
}
## [1] "PC1"
## [1] "PC2"
## Joining with `by = join_by(Sample)`
## [1] "PC2"
## [1] "PC3"
## Joining with `by = join_by(Sample)`

## [1] "PC3"
## [1] "PC4"
## Joining with `by = join_by(Sample)`

## [1] "PC4"
## [1] "PC5"
## Joining with `by = join_by(Sample)`

## [1] "PC5"
## [1] "PC6"
## Joining with `by = join_by(Sample)`